library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)
library(rworldmap)
As COVID-19 spreads at an alarming rate, a pressing question at a global scale emerges– what factors of a country contribute to the spread of Coronavirus. We hope to analyze the relationship between a country’s population level, population density, and continent categorization on the spread of COVID-19.
COVID <- read.csv(file = "total-covid-cases-deaths-per-million.csv")
COVID
COVID %>%
nrow()
[1] 9487
COVID %>%
names()
[1] "total.covid.cases.deaths.per.million" "X" "X.1"
[4] "X.2" "X.3" "X.4"
[7] "X.5" "X.6" "X.7"
[10] "X.8" "X.9" "X.10"
[13] "X.11" "X.12" "X.13"
[16] "X.14" "X.15" "X.16"
[19] "X.17" "X.18" "X.19"
[22] "X.20" "X.21" "X.22"
[25] "X.23" "X.24" "X.25"
[28] "X.26" "X.27" "X.28"
[31] "X.29" "X.30" "X.31"
[34] "X.32" "X.33" "X.34"
[37] "X.35" "X.36" "X.37"
[40] "X.38" "X.39" "X.40"
[43] "X.41" "X.42" "X.43"
[46] "X.44" "X.45" "X.46"
[49] "X.47" "X.48" "X.49"
[52] "X.50" "X.51" "X.52"
[55] "X.53" "X.54" "X.55"
[58] "X.56" "X.57" "X.58"
[61] "X.59" "X.60" "X.61"
[64] "X.62" "X.63" "X.64"
[67] "X.65" "X.66" "X.67"
[70] "X.68" "X.69" "X.70"
[73] "X.71" "X.72" "X.73"
[76] "X.74" "X.75" "X.76"
[79] "X.77" "X.78" "X.79"
[82] "X.80" "X.81" "X.82"
[85] "X.83" "X.84" "X.85"
[88] "X.86" "X.87" "X.88"
[91] "X.89" "X.90" "X.91"
[94] "X.92" "X.93" "X.94"
[97] "X.95" "X.96" "X.97"
[100] "X.98" "X.99" "X.100"
[103] "X.101" "X.102" "X.103"
[106] "X.104" "X.105" "X.106"
[109] "X.107" "X.108" "X.109"
[112] "X.110" "X.111" "X.112"
[115] "X.113" "X.114" "X.115"
[118] "X.116" "X.117" "X.118"
[121] "X.119" "X.120" "X.121"
[124] "X.122" "X.123" "X.124"
[127] "X.125" "X.126" "X.127"
[130] "X.128" "X.129" "X.130"
[133] "X.131" "X.132" "X.133"
[136] "X.134" "X.135" "X.136"
[139] "X.137" "X.138" "X.139"
[142] "X.140" "X.141" "X.142"
[145] "X.143" "X.144" "X.145"
[148] "X.146" "X.147" "X.148"
[151] "X.149" "X.150" "X.151"
[154] "X.152" "X.153" "X.154"
[157] "X.155" "X.156" "X.157"
[160] "X.158" "X.159" "X.160"
[163] "X.161" "X.162" "X.163"
[166] "X.164" "X.165" "X.166"
[169] "X.167" "X.168" "X.169"
[172] "X.170" "X.171" "X.172"
[175] "X.173" "X.174" "X.175"
[178] "X.176" "X.177" "X.178"
[181] "X.179" "X.180" "X.181"
[184] "X.182" "X.183" "X.184"
[187] "X.185" "X.186" "X.187"
[190] "X.188" "X.189" "X.190"
[193] "X.191" "X.192" "X.193"
[196] "X.194" "X.195" "X.196"
[199] "X.197" "X.198" "X.199"
[202] "X.200" "X.201" "X.202"
[205] "X.203" "X.204" "X.205"
[208] "X.206" "X.207" "X.208"
[211] "X.209" "X.210" "X.211"
[214] "X.212" "X.213" "X.214"
[217] "X.215" "X.216" "X.217"
[220] "X.218" "X.219" "X.220"
[223] "X.221" "X.222" "X.223"
[226] "X.224" "X.225" "X.226"
[229] "X.227" "X.228" "X.229"
[232] "X.230" "X.231" "X.232"
[235] "X.233" "X.234" "X.235"
[238] "X.236" "X.237" "X.238"
[241] "X.239" "X.240" "X.241"
[244] "X.242" "X.243" "X.244"
[247] "X.245" "X.246" "X.247"
[250] "X.248" "X.249" "X.250"
[253] "X.251" "X.252" "X.253"
[256] "X.254"
COVID %>%
head()
The original COVID data set is clearly in need of some data wrangling– it has an abundance of empty columns along with improper column headings.
CountryData
CountryData %>%
nrow()
[1] 256
CountryData %>%
names()
[1] "country" "area" "pop" "growth" "birth" "death"
[7] "migr" "maternal" "infant" "life" "fert" "health"
[13] "HIVrate" "HIVpeople" "HIVdeath" "obesity" "underweight" "educ"
[19] "unemploymentYouth" "GDP" "GDPgrowth" "GDPcapita" "saving" "indProd"
[25] "labor" "unemployment" "family" "tax" "budget" "debt"
[31] "inflation" "discount" "lending" "narrow" "broad" "credit"
[37] "shares" "balance" "exports" "imports" "gold" "externalDebt"
[43] "homeStock" "abroadStock" "elecProd" "elecCons" "elecExp" "elecImp"
[49] "elecCap" "elecFossil" "elecNuc" "elecHydro" "elecRenew" "oilProd"
[55] "oilExp" "oilImp" "oilRes" "petroProd" "petroCons" "petroExp"
[61] "petroImp" "gasProd" "gasCons" "gasExp" "gasImp" "gasRes"
[67] "mainlines" "cell" "netHosts" "netUsers" "airports" "railways"
[73] "roadways" "waterways" "marine" "military"
CountryData %>%
head()
CountryData is tidy, but in its current form, it contains many variables nonrelevant to our analysis– we will extract the relevant factors (country, area, pop).
countryRegions
countryRegions %>%
nrow()
[1] 254
countryRegions %>%
names()
[1] "ISO3" "ADMIN" "REGION" "continent" "GEO3major" "GEO3" "IMAGE24" "GLOCAF"
[9] "Stern" "SRESmajor" "SRES" "GBD" "AVOIDnumeric" "AVOIDname" "LDC" "SID"
[17] "LLDC"
countryRegions %>%
head()
The countryRegions data set is tidy, but it contains many variables nonrelevant to our analysis– we will extract the relevant factors (ISO3, REGION).
COVID
Since our analysis is focused on the spread of COVID-19, we select only columns which pertain to the number of COVID-19 cases in countries over time. We rename to columns to descriptive titles, and we convert values to usable form.
TidyCOVID <- COVID %>%
rename(country = total.covid.cases.deaths.per.million ) %>%
rename( code = X ) %>%
rename(date = X.1 ) %>%
rename(casesPerMillion = X.3) %>%
filter(row_number() > 1) %>%
subset(select = c(1,2,3,5)) %>%
mutate( country = as.character(country) ) %>%
mutate( code = as.character(code) ) %>%
mutate(date = mdy(date)) %>%
mutate(casesPerMillion = as.integer(casesPerMillion) - 1)
The Tidy COVID dataset for our analysis.
TidyCOVID
Each instance in TidyCOVID represents a different day in a country’s progression through COVID-19. It provides the country code (which will be later utilized to assign continent categorization), the date, and the total cases per million up at that date.
We will extract the ISO3 country code and continent from the countryRegions data. Since naming conventions of countries is variate, the ISO3 country code allows us a standardized demarcation of country with which to join with other data tables.
Labels <-
countryRegions %>%
subset(select = c("ISO3", "REGION")) %>%
rename(continent = REGION)
Labels
We will select the aspects of CountryData relevant to our analysis. These attributes are: area (sq km) and pop (number of people). From these attributes we calculate the country’s popdensity (person/sq km).
RelevantCountryData <-
CountryData %>%
subset(select = c(1,2,3)) %>%
mutate(popdensity = pop/area)
RelevantCountryData
At this point, we join the tidied COVID data set with the extracted CountryData set such that each case represents a different day in a country’s progression through COVID-19, providing the specific country, date, total cases per million up at that date, the area of the country (sq km), the population of the country, the population density of the countrym the total number of cases (derived by multiplying a country’s casesPerMillion by the population (in millions)), and the continent categorization.
COVIDGrowth <-
inner_join(TidyCOVID, RelevantCountryData, by = c("country")) %>%
mutate("cases" = (casesPerMillion * round(pop/1000000, digits = 0)))
COVIDGrowth <-
COVIDGrowth %>%
left_join(Labels, by = c("code" = "ISO3"))
COVIDGrowth <-
COVIDGrowth %>%
subset(select = c(1, 3, 4, 5, 6, 7, 8, 9))
COVIDGrowth
This new data table records the first date that a country recorded a nonzero number of COVID-19 cases. This datagraph will help us visualize when countries first became infected.
FirstInstance <-
COVIDGrowth %>%
filter(cases != 0) %>%
group_by(country, continent) %>%
summarise(beginningofspread = min(date))
FirstInstance
The DailySpread data frame utilizes the COVIDGrowth data along with the beginningofspread variable (which was derived in the FirstInstance table) in order to calculate a straight-line approximation of the daily spread of COVID-19, averaging cases over time from the first day a country was infected to the most recent date in the data table (April 5 2020). If a country has not been infected, the dailyspread is set to 0.
DailySpread <-
left_join(COVIDGrowth, FirstInstance, by = c("country")) %>%
filter(date == "2020-04-05") %>%
mutate(dayselapsed = date - beginningofspread) %>%
mutate(dailyspread = cases / as.numeric(dayselapsed) ) %>%
mutate(dailyspreadpermillion = casesPerMillion / as.numeric(dayselapsed) ) %>%
subset(select = c("country", "beginningofspread", "dailyspread", "dailyspreadpermillion"))
DailySpread$dailyspread[is.na(DailySpread$dailyspread)] <- 0
DailySpread$dailyspreadpermillion[is.na(DailySpread$dailyspreadpermillion)] <- 0
DailySpread
Joining the growth-calculated COVID data set with the DailySpread statistics allows us our comprehensive data frame with which we will conduct our analysis. Within this frame, each case represents a different day in a country’s progression through COVID-19, providing the specific country, date, total cases per million up at that date, the area of the country (sq km), the population of the country, the population density of the countrym the total number of cases (derived by multiplying a country’s casesPerMillion by the population (in millions)), continent categorization, date which the spread of COVID-19 began, the average daily spread of COVID-19, and the average daily spread of COVID-19 per million of population.
COVIDFinal <-
left_join(COVIDGrowth, DailySpread, by = c("country"))
COVIDFinal
COVIDFinal %>%
group_by(date) %>%
summarise(totalcases = sum(cases)) %>%
ggplot(aes(x = date, y = totalcases)) +
geom_point() +
xlab("Date") +
ylab("COVID-19 Cases")
This graph demonstrates the exponential growth trend of COVID-19 globally. There is a strong positive correlation between the progression of time and then number of COVID-19 cases worldwide.
na.omit(COVIDFinal) %>%
group_by(date, continent) %>%
summarise(totalcases = sum(cases)) %>%
ggplot(aes(x = date, y = totalcases)) +
geom_point() +
facet_wrap(~continent) +
xlab("Date") +
ylab("COVID-19 Cases")
This graph shows the growth of COVID-19 cases over time, faceted by continent. The global exponential trend is most visible in the origin continent of Asia, but the positive correlation of COVID-19 over time is visible in each continents.
na.omit(FirstInstance) %>%
ggplot(aes(x = beginningofspread, fill = continent)) +
geom_dotplot(stackgroups = TRUE, binwidth = 1, binpositions="all") +
xlab("Country's First Case of COVID-19") +
theme(panel.background = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank())
This graph shows the progression of the COVID-19 spread across continents. As supported and congruent with the faceted graph of continental growth over time, Asia was the first continent to be infected, followed by North America and Europe. In late February, South America and Africa began to become infected, and Australia was able to isolate until mid March. Where continental datapoints show density alligns with, in the graphic above, where the continents experience greatest periods of growth.
COVIDFinal %>%
group_by(country) %>%
summarise(dailyspread = mean(dailyspread)) %>%
arrange(desc(dailyspread)) %>%
head(20) %>%
ggplot(aes(x = reorder(country, desc(dailyspread)), y= dailyspread)) +
geom_bar(stat="identity", position = 'stack', width=.9) +
theme(axis.text.x=element_text(angle = 60, hjust = 1)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
ylab("Average Number Infected Per Day") +
theme(axis.title.x = element_blank())
This graphic demonstrates the countries with the highest infection rates– those most significant of which are China, India, Indonesia, and the United States, closely followed by Brazil, after which the rate of infection tapers off to a comparatively similar rate.
COVIDFinal %>%
group_by(country) %>%
summarise(pop = mean(pop)) %>%
arrange(desc(pop)) %>%
head(20) %>%
ggplot(aes(x = reorder(country, desc(pop)), y= pop)) +
geom_bar(stat="identity", position = 'stack', width=.9) +
theme(axis.text.x=element_text(angle = 60, hjust = 1)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
ylab("Population") +
theme(axis.title.x = element_blank())
We see a similar trend here, in that the countries of top population (in slightly different order) consist of China, India, Indonesia, the United States, and closely trailing Brazil, after which population seems to taper off to relatively similar levels.
na.omit(COVIDFinal) %>%
ggplot(aes(x = pop, y = dailyspread, color = continent)) +
geom_point() +
xlab("Population of Country") +
ylab("Average Number Infected Per Day")
This dataframe clearly demonstrates the strong positive correlation between a country’s population and average number infected per day, as supported by our analysis of the previous two graphics. While most countries of the world trail with under 10,000 infected per day, the 5 countries with the highest popualtion in the data set have over 15,000 - up to over 50,000- and are far separated from the rest of the pack. Population, while not a direct factor contributing to the level of development of a country, is a decent indicator of the rate of infection.
na.omit(COVIDFinal) %>%
ggplot(aes(x = pop, y = dailyspread, color = continent)) +
geom_point() +
xlim(0,500000000) +
ylim(0, 40000) +
xlab("Population of Country") +
ylab("Average Number Infected Per Day") +
stat_smooth(method = lm)
Removing the largest outliers of population and number infected, the poitive relationship between average number infected and population still holds; for each continent, the trends are very clearly upward sloping.
COVIDFinal %>%
group_by(country) %>%
summarise(dailyspreadpermillion = mean(dailyspreadpermillion)) %>%
arrange(desc(dailyspreadpermillion)) %>%
head(20) %>%
ggplot(aes(x = reorder(country, desc(dailyspreadpermillion)), y= dailyspreadpermillion)) +
geom_bar(stat="identity", position = 'stack', width=.9) +
theme(axis.text.x=element_text(angle = 60, hjust = 1)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
ylab("Population Per Million Infected Per Day") +
theme(axis.title.x = element_blank())
According the this graph, those with the highest infection rates with the confounding factor of population removed (since it is population per million, it has been adjusted to put all countries on one scale) are Guinea-Bissau, Botswana, Eritrea, El Salvador, and Puerto Rico.
COVIDFinal %>%
group_by(country) %>%
summarise(popdensity = mean(popdensity)) %>%
arrange(desc(popdensity)) %>%
head(20) %>%
ggplot(aes(x = reorder(country, desc(popdensity)), y= popdensity)) +
geom_bar(stat="identity", position = 'stack', width=.9) +
theme(axis.text.x=element_text(angle = 60, hjust = 1)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
ylab("Population Density (people/sq km)") +
theme(axis.title.x = element_blank())
According the this graph, those with the highest population density (number of people per square kilometer) are Monaco, Singapore, and Gibraltar.
na.omit(COVIDFinal) %>%
ggplot(aes(x = popdensity, y = dailyspreadpermillion)) +
geom_point()
On this graph, it is chalenging to see a clear relationship between population density and daily spread of COVID-19 per million. For the sake of comprehensive analysis, we will further divide the information.
na.omit(COVIDFinal) %>%
ggplot(aes(x = popdensity, y = dailyspreadpermillion)) +
geom_point() +
facet_wrap(~continent) +
xlim(0,1500)
Again, it is hard to see a perfect correlaiton here. In general, all the graphs show little to no correlation between the population density and the daily spread per million.
WideCountries <-
COVIDFinal %>%
subset(select = c("country", "date", "cases")) %>%
spread(key = date, value = cases)
WideCountries[is.na(WideCountries)] <- 0
WideCountries
compareCOVID <- function(countryA, countryB) {
A <-
WideCountries %>%
filter(country == countryA)
B <-
WideCountries %>%
filter(country == countryB)
A <-
A %>%
gather(key = date, value = count) %>%
filter(row_number() > 1) %>%
mutate(date = lubridate::ymd(date)) %>%
mutate(count = as.numeric(count)) %>%
mutate(country = countryA)
B <-
B %>%
gather(key = date, value = count) %>%
filter(row_number() > 1) %>%
mutate(date = lubridate::ymd(date))%>%
mutate(count = as.numeric(count)) %>%
mutate(country = countryB)
GG <-
rbind(A,B)
return( ggplot(GG, aes(x = date, y = count, color = country)) +
stat_smooth(formula = y ~ x, method = "loess") +
ylab("Number of COVID-19 Cases") +
xlab("Date"))
}
This function allow us to create a direct comparison between two countries based on the date and the number of COVID-19 cases in each country. This way, we can compare the spread between nations and identify similar trends in different parts of the world.
compareCOVID("China", "United States")
compareCOVID("Japan", "Russia")
compareCOVID("Puerto Rico", "Belgium")
Ultimately, the contributing factors of the spread of COVID-19 still remains a mystery to us. We were able to find a strong positive correlation between the population of a country and the spread. This indicates that the more people a country has, the more likely the infection is to affect a large number of people. However, we struggled to find a strong correlation, once standardized and confounding factors were removed, between the population density of a country and its infection rate per million. This does not necessarily prove that density of living does not contribute to the spread of COVID-19– one fault present within this analysis is that population density was calculated from a country’s entire land area, whereas only a small portion of this land area may be inhabited. If we were to conduct this analysis again, we would seek out data which provides the inhabited living area of a country to more accurately calculate population density. Furthermore, as demonstrated by the graphics, many countries have been infected very recently, or not infected at all. The growth of country demonstrates in this short period of time may not be indicative of the overall growth trend that a country experiences. In the future, once we are more updated and more information about COVID-19 is released, some of the trends which we are seeking may be more relevant or easy to define. However, with the data we currently possess, it was not possible to make the conclusion– that population density is correlated with standardized COVID-19 spread– with confidence.